{ "cells": [ { "cell_type": "markdown", "id": "7fb27b941602401d91542211134fc71a", "metadata": {}, "source": [ "# Fitting a Linear Simulation with XGBoost\n", "\n", "This notebook uses SHAP to demonstrate how XGBoost behaves when we fit it to simulated data where the label has a linear relationship to the features." ] }, { "cell_type": "code", "execution_count": 1, "id": "acae54e37e7d407bbb7b55eff062a284", "metadata": { "execution": { "iopub.execute_input": "2026-04-09T21:20:52.376972Z", "iopub.status.busy": "2026-04-09T21:20:52.376796Z", "iopub.status.idle": "2026-04-09T21:20:57.730665Z", "shell.execute_reply": "2026-04-09T21:20:57.729493Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import xgboost\n", "from sklearn.model_selection import train_test_split\n", "\n", "import shap" ] }, { "cell_type": "markdown", "id": "9a63283cbaf04dbcab1f6479b197f3a8", "metadata": {}, "source": [ "## Build a simulated dataset with linear labels" ] }, { "cell_type": "code", "execution_count": 2, "id": "8dd0d8092fe74a7c96281538738b07e2", "metadata": { "execution": { "iopub.execute_input": "2026-04-09T21:20:57.743309Z", "iopub.status.busy": "2026-04-09T21:20:57.741900Z", "iopub.status.idle": "2026-04-09T21:20:57.762771Z", "shell.execute_reply": "2026-04-09T21:20:57.762089Z" } }, "outputs": [], "source": [ "N = 10000\n", "M = 10\n", "np.random.seed(0)\n", "X_raw = np.random.randn(N, M)\n", "feature_names = [f\"feature {i}\" for i in range(M)]\n", "X = pd.DataFrame(X_raw, columns=feature_names)\n", "beta = np.random.randn(M)\n", "y = X_raw @ beta\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y)\n", "X_strain, X_valid, y_strain, y_valid = train_test_split(X_train, y_train)" ] }, { "cell_type": "markdown", "id": "72eea5119410473aa328ad9291626812", "metadata": {}, "source": [ "## Build an XGBoost regressor" ] }, { "cell_type": "markdown", "id": "8edb47106e1a46a883d545849b8ab81b", "metadata": {}, "source": [ "### Train a depth 1 model" ] },
{ "cell_type": "code", "execution_count": 3, "id": "10185d26023b46108eb7d9f57d49d2b3", "metadata": { "execution": { "iopub.execute_input": "2026-04-09T21:20:57.765193Z", "iopub.status.busy": "2026-04-09T21:20:57.765046Z", "iopub.status.idle": "2026-04-09T21:21:05.128500Z", "shell.execute_reply": "2026-04-09T21:21:05.127546Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0]\tvalidation_0-rmse:2.17988\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalidation_0-rmse:0.95726\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[2000]\tvalidation_0-rmse:0.60452\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[3000]\tvalidation_0-rmse:0.41705\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[4000]\tvalidation_0-rmse:0.30822\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[5000]\tvalidation_0-rmse:0.24119\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[6000]\tvalidation_0-rmse:0.19857\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[7000]\tvalidation_0-rmse:0.17118\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[8000]\tvalidation_0-rmse:0.15386\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[9000]\tvalidation_0-rmse:0.14333\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[9999]\tvalidation_0-rmse:0.13717\n" ] }, { "data": { "text/html": [ "
XGBRegressor(base_score=np.float64(0.013271975120564444), booster=None,\n",
" callbacks=None, colsample_bylevel=None, colsample_bynode=None,\n",
" colsample_bytree=None, device=None, early_stopping_rounds=20,\n",
" enable_categorical=False, eval_metric=None, feature_types=None,\n",
" feature_weights=None, gamma=None, grow_policy=None,\n",
" importance_type=None, interaction_constraints=None,\n",
" learning_rate=0.01, max_bin=None, max_cat_threshold=None,\n",
" max_cat_to_onehot=None, max_delta_step=None, max_depth=1,\n",
" max_leaves=None, min_child_weight=None, missing=nan,\n",
" monotone_constraints=None, multi_strategy=None, n_estimators=10000,\n",
" n_jobs=None, num_parallel_tree=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=np.float64(0.013271975120564444), booster=None,\n",
" callbacks=None, colsample_bylevel=None, colsample_bynode=None,\n",
" colsample_bytree=0.5, device=None, early_stopping_rounds=20,\n",
" enable_categorical=False, eval_metric=None, feature_types=None,\n",
" feature_weights=None, gamma=None, grow_policy=None,\n",
" importance_type=None, interaction_constraints=None,\n",
" learning_rate=0.02, max_bin=None, max_cat_threshold=None,\n",
" max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n",
" max_leaves=None, min_child_weight=None, missing=nan,\n",
" monotone_constraints=None, multi_strategy=None, n_estimators=5000,\n",
" n_jobs=None, num_parallel_tree=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.